{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = {\n",
    "    'docs':[],\n",
    "    'labels':[]\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['ham', 'spam']"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "os.listdir('email')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['1.txt',\n",
       " '10.txt',\n",
       " '11.txt',\n",
       " '12.txt',\n",
       " '13.txt',\n",
       " '14.txt',\n",
       " '15.txt',\n",
       " '16.txt',\n",
       " '17.txt',\n",
       " '18.txt',\n",
       " '19.txt',\n",
       " '2.txt',\n",
       " '20.txt',\n",
       " '21.txt',\n",
       " '22.txt',\n",
       " '23.txt',\n",
       " '24.txt',\n",
       " '25.txt',\n",
       " '3.txt',\n",
       " '4.txt',\n",
       " '5.txt',\n",
       " '6.txt',\n",
       " '7.txt',\n",
       " '8.txt',\n",
       " '9.txt']"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "os.listdir('email/spam')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Start from empty lists so that re-running this cell does not append duplicates.\n",
    "data['docs'].clear()\n",
    "data['labels'].clear()\n",
    "\n",
    "# Every sub-folder of 'email' is used as a label and every file inside it as one document.\n",
    "# sorted() fixes the traversal order, which os.listdir() leaves unspecified.\n",
    "for label in sorted(os.listdir('email')):\n",
    "    for filename in sorted(os.listdir('email/'+label)):\n",
    "        # NOTE(review): no explicit encoding is given, so decoding follows the platform\n",
    "        # default and undecodable bytes are dropped by errors='ignore' -- confirm the\n",
    "        # corpus encoding and pass it explicitly.\n",
    "        with open('email/'+label+'/'+filename, errors='ignore') as f:\n",
    "            doc = f.read()\n",
    "        # Tokenise: split on non-word characters, lowercase, keep tokens longer than one character.\n",
    "        doc = [tok.lower() for tok in re.split(r'\\W', doc) if len(tok) > 1]\n",
    "        data['docs'].append(doc)\n",
    "        data['labels'].append(label)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['bargains',\n",
       " 'here',\n",
       " 'buy',\n",
       " 'phentermin',\n",
       " '37',\n",
       " 'mg',\n",
       " '25',\n",
       " 'buy',\n",
       " 'genuine',\n",
       " 'phentermin',\n",
       " 'at',\n",
       " 'low',\n",
       " 'cost',\n",
       " 'visa',\n",
       " 'accepted',\n",
       " '30',\n",
       " '130',\n",
       " '50',\n",
       " '60',\n",
       " '219',\n",
       " '00',\n",
       " '90',\n",
       " '292',\n",
       " '50',\n",
       " '120',\n",
       " '366',\n",
       " '00',\n",
       " '180',\n",
       " '513',\n",
       " '00']"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "doc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['hi',\n",
       "  'peter',\n",
       "  'with',\n",
       "  'jose',\n",
       "  'out',\n",
       "  'of',\n",
       "  'town',\n",
       "  'do',\n",
       "  'you',\n",
       "  'want',\n",
       "  'to',\n",
       "  'meet',\n",
       "  'once',\n",
       "  'in',\n",
       "  'while',\n",
       "  'to',\n",
       "  'keep',\n",
       "  'things',\n",
       "  'going',\n",
       "  'and',\n",
       "  'do',\n",
       "  'some',\n",
       "  'interesting',\n",
       "  'stuff',\n",
       "  'let',\n",
       "  'me',\n",
       "  'know',\n",
       "  'eugene'],\n",
       " ['ryan',\n",
       "  'whybrew',\n",
       "  'commented',\n",
       "  'on',\n",
       "  'your',\n",
       "  'status',\n",
       "  'ryan',\n",
       "  'wrote',\n",
       "  'turd',\n",
       "  'ferguson',\n",
       "  'or',\n",
       "  'butt',\n",
       "  'horn'],\n",
       " ['arvind',\n",
       "  'thirumalai',\n",
       "  'commented',\n",
       "  'on',\n",
       "  'your',\n",
       "  'status',\n",
       "  'arvind',\n",
       "  'wrote',\n",
       "  'you',\n",
       "  'know',\n",
       "  'reply',\n",
       "  'to',\n",
       "  'this',\n",
       "  'email',\n",
       "  'to',\n",
       "  'comment',\n",
       "  'on',\n",
       "  'this',\n",
       "  'status'],\n",
       " ['thanks',\n",
       "  'peter',\n",
       "  'll',\n",
       "  'definitely',\n",
       "  'check',\n",
       "  'in',\n",
       "  'on',\n",
       "  'this',\n",
       "  'how',\n",
       "  'is',\n",
       "  'your',\n",
       "  'book',\n",
       "  'going',\n",
       "  'heard',\n",
       "  'chapter',\n",
       "  'came',\n",
       "  'in',\n",
       "  'and',\n",
       "  'it',\n",
       "  'was',\n",
       "  'in',\n",
       "  'good',\n",
       "  'shape',\n",
       "  'hope',\n",
       "  'you',\n",
       "  'are',\n",
       "  'doing',\n",
       "  'well',\n",
       "  'cheers',\n",
       "  'troy'],\n",
       " ['jay',\n",
       "  'stepp',\n",
       "  'commented',\n",
       "  'on',\n",
       "  'your',\n",
       "  'status',\n",
       "  'jay',\n",
       "  'wrote',\n",
       "  'to',\n",
       "  'the',\n",
       "  'reply',\n",
       "  'to',\n",
       "  'this',\n",
       "  'email',\n",
       "  'to',\n",
       "  'comment',\n",
       "  'on',\n",
       "  'this',\n",
       "  'status',\n",
       "  'to',\n",
       "  'see',\n",
       "  'the',\n",
       "  'comment',\n",
       "  'thread',\n",
       "  'follow',\n",
       "  'the',\n",
       "  'link',\n",
       "  'below'],\n",
       " ['linkedin',\n",
       "  'kerry',\n",
       "  'haloney',\n",
       "  'requested',\n",
       "  'to',\n",
       "  'add',\n",
       "  'you',\n",
       "  'as',\n",
       "  'connection',\n",
       "  'on',\n",
       "  'linkedin',\n",
       "  'peter',\n",
       "  'like',\n",
       "  'to',\n",
       "  'add',\n",
       "  'you',\n",
       "  'to',\n",
       "  'my',\n",
       "  'professional',\n",
       "  'network',\n",
       "  'on',\n",
       "  'linkedin',\n",
       "  'kerry',\n",
       "  'haloney'],\n",
       " ['hi',\n",
       "  'peter',\n",
       "  'the',\n",
       "  'hotels',\n",
       "  'are',\n",
       "  'the',\n",
       "  'ones',\n",
       "  'that',\n",
       "  'rent',\n",
       "  'out',\n",
       "  'the',\n",
       "  'tent',\n",
       "  'they',\n",
       "  'are',\n",
       "  'all',\n",
       "  'lined',\n",
       "  'up',\n",
       "  'on',\n",
       "  'the',\n",
       "  'hotel',\n",
       "  'grounds',\n",
       "  'so',\n",
       "  'much',\n",
       "  'for',\n",
       "  'being',\n",
       "  'one',\n",
       "  'with',\n",
       "  'nature',\n",
       "  'more',\n",
       "  'like',\n",
       "  'being',\n",
       "  'one',\n",
       "  'with',\n",
       "  'couple',\n",
       "  'dozen',\n",
       "  'tour',\n",
       "  'groups',\n",
       "  'and',\n",
       "  'nature',\n",
       "  'have',\n",
       "  'about',\n",
       "  '100m',\n",
       "  'of',\n",
       "  'pictures',\n",
       "  'from',\n",
       "  'that',\n",
       "  'trip',\n",
       "  'can',\n",
       "  'go',\n",
       "  'through',\n",
       "  'them',\n",
       "  'and',\n",
       "  'get',\n",
       "  'you',\n",
       "  'jpgs',\n",
       "  'of',\n",
       "  'my',\n",
       "  'favorite',\n",
       "  'scenic',\n",
       "  'pictures',\n",
       "  'where',\n",
       "  'are',\n",
       "  'you',\n",
       "  'and',\n",
       "  'jocelyn',\n",
       "  'now',\n",
       "  'new',\n",
       "  'york',\n",
       "  'will',\n",
       "  'you',\n",
       "  'come',\n",
       "  'to',\n",
       "  'tokyo',\n",
       "  'for',\n",
       "  'chinese',\n",
       "  'new',\n",
       "  'year',\n",
       "  'perhaps',\n",
       "  'to',\n",
       "  'see',\n",
       "  'the',\n",
       "  'two',\n",
       "  'of',\n",
       "  'you',\n",
       "  'then',\n",
       "  'will',\n",
       "  'go',\n",
       "  'to',\n",
       "  'thailand',\n",
       "  'for',\n",
       "  'winter',\n",
       "  'holiday',\n",
       "  'to',\n",
       "  'see',\n",
       "  'my',\n",
       "  'mom',\n",
       "  'take',\n",
       "  'care'],\n",
       " ['yeah',\n",
       "  'am',\n",
       "  'ready',\n",
       "  'may',\n",
       "  'not',\n",
       "  'be',\n",
       "  'here',\n",
       "  'because',\n",
       "  'jar',\n",
       "  'jar',\n",
       "  'has',\n",
       "  'plane',\n",
       "  'tickets',\n",
       "  'to',\n",
       "  'germany',\n",
       "  'for',\n",
       "  'me'],\n",
       " ['benoit',\n",
       "  'mandelbrot',\n",
       "  '1924',\n",
       "  '2010',\n",
       "  'benoit',\n",
       "  'mandelbrot',\n",
       "  '1924',\n",
       "  '2010',\n",
       "  'wilmott',\n",
       "  'team',\n",
       "  'benoit',\n",
       "  'mandelbrot',\n",
       "  'the',\n",
       "  'mathematician',\n",
       "  'the',\n",
       "  'father',\n",
       "  'of',\n",
       "  'fractal',\n",
       "  'mathematics',\n",
       "  'and',\n",
       "  'advocate',\n",
       "  'of',\n",
       "  'more',\n",
       "  'sophisticated',\n",
       "  'modelling',\n",
       "  'in',\n",
       "  'quantitative',\n",
       "  'finance',\n",
       "  'died',\n",
       "  'on',\n",
       "  '14th',\n",
       "  'october',\n",
       "  '2010',\n",
       "  'aged',\n",
       "  '85',\n",
       "  'wilmott',\n",
       "  'magazine',\n",
       "  'has',\n",
       "  'often',\n",
       "  'featured',\n",
       "  'mandelbrot',\n",
       "  'his',\n",
       "  'ideas',\n",
       "  'and',\n",
       "  'the',\n",
       "  'work',\n",
       "  'of',\n",
       "  'others',\n",
       "  'inspired',\n",
       "  'by',\n",
       "  'his',\n",
       "  'fundamental',\n",
       "  'insights',\n",
       "  'you',\n",
       "  'must',\n",
       "  'be',\n",
       "  'logged',\n",
       "  'on',\n",
       "  'to',\n",
       "  'view',\n",
       "  'these',\n",
       "  'articles',\n",
       "  'from',\n",
       "  'past',\n",
       "  'issues',\n",
       "  'of',\n",
       "  'wilmott',\n",
       "  'magazine'],\n",
       " ['hi',\n",
       "  'peter',\n",
       "  'sure',\n",
       "  'thing',\n",
       "  'sounds',\n",
       "  'good',\n",
       "  'let',\n",
       "  'me',\n",
       "  'know',\n",
       "  'what',\n",
       "  'time',\n",
       "  'would',\n",
       "  'be',\n",
       "  'good',\n",
       "  'for',\n",
       "  'you',\n",
       "  'will',\n",
       "  'come',\n",
       "  'prepared',\n",
       "  'with',\n",
       "  'some',\n",
       "  'ideas',\n",
       "  'and',\n",
       "  'we',\n",
       "  'can',\n",
       "  'go',\n",
       "  'from',\n",
       "  'there',\n",
       "  'regards',\n",
       "  'vivek'],\n",
       " ['linkedin',\n",
       "  'julius',\n",
       "  'requested',\n",
       "  'to',\n",
       "  'add',\n",
       "  'you',\n",
       "  'as',\n",
       "  'connection',\n",
       "  'on',\n",
       "  'linkedin',\n",
       "  'hi',\n",
       "  'peter',\n",
       "  'looking',\n",
       "  'forward',\n",
       "  'to',\n",
       "  'the',\n",
       "  'book',\n",
       "  'accept',\n",
       "  'view',\n",
       "  'invitation',\n",
       "  'from',\n",
       "  'julius'],\n",
       " ['yay',\n",
       "  'to',\n",
       "  'you',\n",
       "  'both',\n",
       "  'doing',\n",
       "  'fine',\n",
       "  'working',\n",
       "  'on',\n",
       "  'an',\n",
       "  'mba',\n",
       "  'in',\n",
       "  'design',\n",
       "  'strategy',\n",
       "  'at',\n",
       "  'cca',\n",
       "  'top',\n",
       "  'art',\n",
       "  'school',\n",
       "  'it',\n",
       "  'new',\n",
       "  'program',\n",
       "  'focusing',\n",
       "  'on',\n",
       "  'more',\n",
       "  'of',\n",
       "  'right',\n",
       "  'brained',\n",
       "  'creative',\n",
       "  'and',\n",
       "  'strategic',\n",
       "  'approach',\n",
       "  'to',\n",
       "  'management',\n",
       "  'an',\n",
       "  'of',\n",
       "  'the',\n",
       "  'way',\n",
       "  'done',\n",
       "  'today'],\n",
       " ['ve',\n",
       "  'thought',\n",
       "  'about',\n",
       "  'this',\n",
       "  'and',\n",
       "  'think',\n",
       "  'it',\n",
       "  'possible',\n",
       "  'we',\n",
       "  'should',\n",
       "  'get',\n",
       "  'another',\n",
       "  'lunch',\n",
       "  'have',\n",
       "  'car',\n",
       "  'now',\n",
       "  'and',\n",
       "  'could',\n",
       "  'come',\n",
       "  'pick',\n",
       "  'you',\n",
       "  'up',\n",
       "  'this',\n",
       "  'time',\n",
       "  'does',\n",
       "  'this',\n",
       "  'wednesday',\n",
       "  'work',\n",
       "  '11',\n",
       "  '50',\n",
       "  'can',\n",
       "  'have',\n",
       "  'signed',\n",
       "  'copy',\n",
       "  'of',\n",
       "  'you',\n",
       "  'book'],\n",
       " ['we',\n",
       "  'saw',\n",
       "  'this',\n",
       "  'on',\n",
       "  'the',\n",
       "  'way',\n",
       "  'to',\n",
       "  'the',\n",
       "  'coast',\n",
       "  'thought',\n",
       "  'might',\n",
       "  'like',\n",
       "  'it',\n",
       "  'hangzhou',\n",
       "  'is',\n",
       "  'huge',\n",
       "  'one',\n",
       "  'day',\n",
       "  'wasn',\n",
       "  'enough',\n",
       "  'but',\n",
       "  'we',\n",
       "  'got',\n",
       "  'glimpse',\n",
       "  'we',\n",
       "  'went',\n",
       "  'inside',\n",
       "  'the',\n",
       "  'china',\n",
       "  'pavilion',\n",
       "  'at',\n",
       "  'expo',\n",
       "  'it',\n",
       "  'is',\n",
       "  'pretty',\n",
       "  'interesting',\n",
       "  'each',\n",
       "  'province',\n",
       "  'has',\n",
       "  'an',\n",
       "  'exhibit'],\n",
       " ['hi',\n",
       "  'hommies',\n",
       "  'just',\n",
       "  'got',\n",
       "  'phone',\n",
       "  'call',\n",
       "  'from',\n",
       "  'the',\n",
       "  'roofer',\n",
       "  'they',\n",
       "  'will',\n",
       "  'come',\n",
       "  'and',\n",
       "  'spaying',\n",
       "  'the',\n",
       "  'foaming',\n",
       "  'today',\n",
       "  'it',\n",
       "  'will',\n",
       "  'be',\n",
       "  'dusty',\n",
       "  'pls',\n",
       "  'close',\n",
       "  'all',\n",
       "  'the',\n",
       "  'doors',\n",
       "  'and',\n",
       "  'windows',\n",
       "  'could',\n",
       "  'you',\n",
       "  'help',\n",
       "  'me',\n",
       "  'to',\n",
       "  'close',\n",
       "  'my',\n",
       "  'bathroom',\n",
       "  'window',\n",
       "  'cat',\n",
       "  'window',\n",
       "  'and',\n",
       "  'the',\n",
       "  'sliding',\n",
       "  'door',\n",
       "  'behind',\n",
       "  'the',\n",
       "  'tv',\n",
       "  'don',\n",
       "  'know',\n",
       "  'how',\n",
       "  'can',\n",
       "  'those',\n",
       "  'cats',\n",
       "  'survive',\n",
       "  'sorry',\n",
       "  'for',\n",
       "  'any',\n",
       "  'inconvenience'],\n",
       " ['scifinance',\n",
       "  'now',\n",
       "  'automatically',\n",
       "  'generates',\n",
       "  'gpu',\n",
       "  'enabled',\n",
       "  'pricing',\n",
       "  'risk',\n",
       "  'model',\n",
       "  'source',\n",
       "  'code',\n",
       "  'that',\n",
       "  'runs',\n",
       "  'up',\n",
       "  'to',\n",
       "  '50',\n",
       "  '300x',\n",
       "  'faster',\n",
       "  'than',\n",
       "  'serial',\n",
       "  'code',\n",
       "  'using',\n",
       "  'new',\n",
       "  'nvidia',\n",
       "  'fermi',\n",
       "  'class',\n",
       "  'tesla',\n",
       "  '20',\n",
       "  'series',\n",
       "  'gpu',\n",
       "  'scifinance',\n",
       "  'is',\n",
       "  'derivatives',\n",
       "  'pricing',\n",
       "  'and',\n",
       "  'risk',\n",
       "  'model',\n",
       "  'development',\n",
       "  'tool',\n",
       "  'that',\n",
       "  'automatically',\n",
       "  'generates',\n",
       "  'and',\n",
       "  'gpu',\n",
       "  'enabled',\n",
       "  'source',\n",
       "  'code',\n",
       "  'from',\n",
       "  'concise',\n",
       "  'high',\n",
       "  'level',\n",
       "  'model',\n",
       "  'specifications',\n",
       "  'no',\n",
       "  'parallel',\n",
       "  'computing',\n",
       "  'or',\n",
       "  'cuda',\n",
       "  'programming',\n",
       "  'expertise',\n",
       "  'is',\n",
       "  'required',\n",
       "  'scifinance',\n",
       "  'automatic',\n",
       "  'gpu',\n",
       "  'enabled',\n",
       "  'monte',\n",
       "  'carlo',\n",
       "  'pricing',\n",
       "  'model',\n",
       "  'source',\n",
       "  'code',\n",
       "  'generation',\n",
       "  'capabilities',\n",
       "  'have',\n",
       "  'been',\n",
       "  'significantly',\n",
       "  'extended',\n",
       "  'in',\n",
       "  'the',\n",
       "  'latest',\n",
       "  'release',\n",
       "  'this',\n",
       "  'includes'],\n",
       " ['ok', 'will', 'be', 'there', 'by', '10', '00', 'at', 'the', 'latest'],\n",
       " ['that',\n",
       "  'is',\n",
       "  'cold',\n",
       "  'is',\n",
       "  'there',\n",
       "  'going',\n",
       "  'to',\n",
       "  'be',\n",
       "  'retirement',\n",
       "  'party',\n",
       "  'are',\n",
       "  'the',\n",
       "  'leaves',\n",
       "  'changing',\n",
       "  'color'],\n",
       " ['what',\n",
       "  'is',\n",
       "  'going',\n",
       "  'on',\n",
       "  'there',\n",
       "  'talked',\n",
       "  'to',\n",
       "  'john',\n",
       "  'on',\n",
       "  'email',\n",
       "  'we',\n",
       "  'talked',\n",
       "  'about',\n",
       "  'some',\n",
       "  'computer',\n",
       "  'stuff',\n",
       "  'that',\n",
       "  'it',\n",
       "  'went',\n",
       "  'bike',\n",
       "  'riding',\n",
       "  'in',\n",
       "  'the',\n",
       "  'rain',\n",
       "  'it',\n",
       "  'was',\n",
       "  'not',\n",
       "  'that',\n",
       "  'cold',\n",
       "  'we',\n",
       "  'went',\n",
       "  'to',\n",
       "  'the',\n",
       "  'museum',\n",
       "  'in',\n",
       "  'sf',\n",
       "  'yesterday',\n",
       "  'it',\n",
       "  'was',\n",
       "  'to',\n",
       "  'get',\n",
       "  'in',\n",
       "  'and',\n",
       "  'they',\n",
       "  'had',\n",
       "  'free',\n",
       "  'food',\n",
       "  'at',\n",
       "  'the',\n",
       "  'same',\n",
       "  'time',\n",
       "  'was',\n",
       "  'sf',\n",
       "  'giants',\n",
       "  'game',\n",
       "  'when',\n",
       "  'we',\n",
       "  'got',\n",
       "  'done',\n",
       "  'we',\n",
       "  'had',\n",
       "  'to',\n",
       "  'take',\n",
       "  'the',\n",
       "  'train',\n",
       "  'with',\n",
       "  'all',\n",
       "  'the',\n",
       "  'giants',\n",
       "  'fans',\n",
       "  'they',\n",
       "  'are',\n",
       "  'drunk'],\n",
       " ['yo',\n",
       "  've',\n",
       "  'been',\n",
       "  'working',\n",
       "  'on',\n",
       "  'my',\n",
       "  'running',\n",
       "  'website',\n",
       "  'using',\n",
       "  'jquery',\n",
       "  'and',\n",
       "  'the',\n",
       "  'jqplot',\n",
       "  'plugin',\n",
       "  'not',\n",
       "  'too',\n",
       "  'far',\n",
       "  'away',\n",
       "  'from',\n",
       "  'having',\n",
       "  'prototype',\n",
       "  'to',\n",
       "  'launch',\n",
       "  'you',\n",
       "  'used',\n",
       "  'jqplot',\n",
       "  'right',\n",
       "  'if',\n",
       "  'not',\n",
       "  'think',\n",
       "  'you',\n",
       "  'would',\n",
       "  'like',\n",
       "  'it'],\n",
       " ['there',\n",
       "  'was',\n",
       "  'guy',\n",
       "  'at',\n",
       "  'the',\n",
       "  'gas',\n",
       "  'station',\n",
       "  'who',\n",
       "  'told',\n",
       "  'me',\n",
       "  'that',\n",
       "  'if',\n",
       "  'knew',\n",
       "  'mandarin',\n",
       "  'and',\n",
       "  'python',\n",
       "  'could',\n",
       "  'get',\n",
       "  'job',\n",
       "  'with',\n",
       "  'the',\n",
       "  'fbi'],\n",
       " ['hello',\n",
       "  'since',\n",
       "  'you',\n",
       "  'are',\n",
       "  'an',\n",
       "  'owner',\n",
       "  'of',\n",
       "  'at',\n",
       "  'least',\n",
       "  'one',\n",
       "  'google',\n",
       "  'groups',\n",
       "  'group',\n",
       "  'that',\n",
       "  'uses',\n",
       "  'the',\n",
       "  'customized',\n",
       "  'welcome',\n",
       "  'message',\n",
       "  'pages',\n",
       "  'or',\n",
       "  'files',\n",
       "  'we',\n",
       "  'are',\n",
       "  'writing',\n",
       "  'to',\n",
       "  'inform',\n",
       "  'you',\n",
       "  'that',\n",
       "  'we',\n",
       "  'will',\n",
       "  'no',\n",
       "  'longer',\n",
       "  'be',\n",
       "  'supporting',\n",
       "  'these',\n",
       "  'features',\n",
       "  'starting',\n",
       "  'february',\n",
       "  '2011',\n",
       "  'we',\n",
       "  'made',\n",
       "  'this',\n",
       "  'decision',\n",
       "  'so',\n",
       "  'that',\n",
       "  'we',\n",
       "  'can',\n",
       "  'focus',\n",
       "  'on',\n",
       "  'improving',\n",
       "  'the',\n",
       "  'core',\n",
       "  'functionalities',\n",
       "  'of',\n",
       "  'google',\n",
       "  'groups',\n",
       "  'mailing',\n",
       "  'lists',\n",
       "  'and',\n",
       "  'forum',\n",
       "  'discussions',\n",
       "  'instead',\n",
       "  'of',\n",
       "  'these',\n",
       "  'features',\n",
       "  'we',\n",
       "  'encourage',\n",
       "  'you',\n",
       "  'to',\n",
       "  'use',\n",
       "  'products',\n",
       "  'that',\n",
       "  'are',\n",
       "  'designed',\n",
       "  'specifically',\n",
       "  'for',\n",
       "  'file',\n",
       "  'storage',\n",
       "  'and',\n",
       "  'page',\n",
       "  'creation',\n",
       "  'such',\n",
       "  'as',\n",
       "  'google',\n",
       "  'docs',\n",
       "  'and',\n",
       "  'google',\n",
       "  'sites',\n",
       "  'for',\n",
       "  'example',\n",
       "  'you',\n",
       "  'can',\n",
       "  'easily',\n",
       "  'create',\n",
       "  'your',\n",
       "  'pages',\n",
       "  'on',\n",
       "  'google',\n",
       "  'sites',\n",
       "  'and',\n",
       "  'share',\n",
       "  'the',\n",
       "  'site',\n",
       "  'http',\n",
       "  'www',\n",
       "  'google',\n",
       "  'com',\n",
       "  'support',\n",
       "  'sites',\n",
       "  'bin',\n",
       "  'answer',\n",
       "  'py',\n",
       "  'hl',\n",
       "  'en',\n",
       "  'answer',\n",
       "  '174623',\n",
       "  'with',\n",
       "  'the',\n",
       "  'members',\n",
       "  'of',\n",
       "  'your',\n",
       "  'group',\n",
       "  'you',\n",
       "  'can',\n",
       "  'also',\n",
       "  'store',\n",
       "  'your',\n",
       "  'files',\n",
       "  'on',\n",
       "  'the',\n",
       "  'site',\n",
       "  'by',\n",
       "  'attaching',\n",
       "  'files',\n",
       "  'to',\n",
       "  'pages',\n",
       "  'http',\n",
       "  'www',\n",
       "  'google',\n",
       "  'com',\n",
       "  'support',\n",
       "  'sites',\n",
       "  'bin',\n",
       "  'answer',\n",
       "  'py',\n",
       "  'hl',\n",
       "  'en',\n",
       "  'answer',\n",
       "  '90563',\n",
       "  'on',\n",
       "  'the',\n",
       "  'site',\n",
       "  'if',\n",
       "  'you抮e',\n",
       "  'just',\n",
       "  'looking',\n",
       "  'for',\n",
       "  'place',\n",
       "  'to',\n",
       "  'upload',\n",
       "  'your',\n",
       "  'files',\n",
       "  'so',\n",
       "  'that',\n",
       "  'your',\n",
       "  'group',\n",
       "  'members',\n",
       "  'can',\n",
       "  'download',\n",
       "  'them',\n",
       "  'we',\n",
       "  'suggest',\n",
       "  'you',\n",
       "  'try',\n",
       "  'google',\n",
       "  'docs',\n",
       "  'you',\n",
       "  'can',\n",
       "  'upload',\n",
       "  'files',\n",
       "  'http',\n",
       "  'docs',\n",
       "  'google',\n",
       "  'com',\n",
       "  'support',\n",
       "  'bin',\n",
       "  'answer',\n",
       "  'py',\n",
       "  'hl',\n",
       "  'en',\n",
       "  'answer',\n",
       "  '50092',\n",
       "  'and',\n",
       "  'share',\n",
       "  'access',\n",
       "  'with',\n",
       "  'either',\n",
       "  'group',\n",
       "  'http',\n",
       "  'docs',\n",
       "  'google',\n",
       "  'com',\n",
       "  'support',\n",
       "  'bin',\n",
       "  'answer',\n",
       "  'py',\n",
       "  'hl',\n",
       "  'en',\n",
       "  'answer',\n",
       "  '66343',\n",
       "  'or',\n",
       "  'an',\n",
       "  'individual',\n",
       "  'http',\n",
       "  'docs',\n",
       "  'google',\n",
       "  'com',\n",
       "  'support',\n",
       "  'bin',\n",
       "  'answer',\n",
       "  'py',\n",
       "  'hl',\n",
       "  'en',\n",
       "  'answer',\n",
       "  '86152',\n",
       "  'assigning',\n",
       "  'either',\n",
       "  'edit',\n",
       "  'or',\n",
       "  'download',\n",
       "  'only',\n",
       "  'access',\n",
       "  'to',\n",
       "  'the',\n",
       "  'files',\n",
       "  'you',\n",
       "  'have',\n",
       "  'received',\n",
       "  'this',\n",
       "  'mandatory',\n",
       "  'email',\n",
       "  'service',\n",
       "  'announcement',\n",
       "  'to',\n",
       "  'update',\n",
       "  'you',\n",
       "  'about',\n",
       "  'important',\n",
       "  'changes',\n",
       "  'to',\n",
       "  'google',\n",
       "  'groups'],\n",
       " ['zach',\n",
       "  'hamm',\n",
       "  'commented',\n",
       "  'on',\n",
       "  'your',\n",
       "  'status',\n",
       "  'zach',\n",
       "  'wrote',\n",
       "  'doggy',\n",
       "  'style',\n",
       "  'enough',\n",
       "  'said',\n",
       "  'thank',\n",
       "  'you',\n",
       "  'good',\n",
       "  'night'],\n",
       " ['this',\n",
       "  'mail',\n",
       "  'was',\n",
       "  'sent',\n",
       "  'from',\n",
       "  'notification',\n",
       "  'only',\n",
       "  'address',\n",
       "  'that',\n",
       "  'cannot',\n",
       "  'accept',\n",
       "  'incoming',\n",
       "  'mail',\n",
       "  'please',\n",
       "  'do',\n",
       "  'not',\n",
       "  'reply',\n",
       "  'to',\n",
       "  'this',\n",
       "  'message',\n",
       "  'thank',\n",
       "  'you',\n",
       "  'for',\n",
       "  'your',\n",
       "  'online',\n",
       "  'reservation',\n",
       "  'the',\n",
       "  'store',\n",
       "  'you',\n",
       "  'selected',\n",
       "  'has',\n",
       "  'located',\n",
       "  'the',\n",
       "  'item',\n",
       "  'you',\n",
       "  'requested',\n",
       "  'and',\n",
       "  'has',\n",
       "  'placed',\n",
       "  'it',\n",
       "  'on',\n",
       "  'hold',\n",
       "  'in',\n",
       "  'your',\n",
       "  'name',\n",
       "  'please',\n",
       "  'note',\n",
       "  'that',\n",
       "  'all',\n",
       "  'items',\n",
       "  'are',\n",
       "  'held',\n",
       "  'for',\n",
       "  'day',\n",
       "  'please',\n",
       "  'note',\n",
       "  'store',\n",
       "  'prices',\n",
       "  'may',\n",
       "  'differ',\n",
       "  'from',\n",
       "  'those',\n",
       "  'online',\n",
       "  'if',\n",
       "  'you',\n",
       "  'have',\n",
       "  'questions',\n",
       "  'or',\n",
       "  'need',\n",
       "  'assistance',\n",
       "  'with',\n",
       "  'your',\n",
       "  'reservation',\n",
       "  'please',\n",
       "  'contact',\n",
       "  'the',\n",
       "  'store',\n",
       "  'at',\n",
       "  'the',\n",
       "  'phone',\n",
       "  'number',\n",
       "  'listed',\n",
       "  'below',\n",
       "  'you',\n",
       "  'can',\n",
       "  'also',\n",
       "  'access',\n",
       "  'store',\n",
       "  'information',\n",
       "  'such',\n",
       "  'as',\n",
       "  'store',\n",
       "  'hours',\n",
       "  'and',\n",
       "  'location',\n",
       "  'on',\n",
       "  'the',\n",
       "  'web',\n",
       "  'at',\n",
       "  'http',\n",
       "  'www',\n",
       "  'borders',\n",
       "  'com',\n",
       "  'online',\n",
       "  'store',\n",
       "  'storedetailview_98'],\n",
       " ['hi',\n",
       "  'peter',\n",
       "  'these',\n",
       "  'are',\n",
       "  'the',\n",
       "  'only',\n",
       "  'good',\n",
       "  'scenic',\n",
       "  'ones',\n",
       "  'and',\n",
       "  'it',\n",
       "  'too',\n",
       "  'bad',\n",
       "  'there',\n",
       "  'was',\n",
       "  'girl',\n",
       "  'back',\n",
       "  'in',\n",
       "  'one',\n",
       "  'of',\n",
       "  'them',\n",
       "  'just',\n",
       "  'try',\n",
       "  'to',\n",
       "  'enjoy',\n",
       "  'the',\n",
       "  'blue',\n",
       "  'sky'],\n",
       " ['codeine',\n",
       "  '15mg',\n",
       "  '30',\n",
       "  'for',\n",
       "  '203',\n",
       "  '70',\n",
       "  'visa',\n",
       "  'only',\n",
       "  'codeine',\n",
       "  'methylmorphine',\n",
       "  'is',\n",
       "  'narcotic',\n",
       "  'opioid',\n",
       "  'pain',\n",
       "  'reliever',\n",
       "  'we',\n",
       "  'have',\n",
       "  '15mg',\n",
       "  '30mg',\n",
       "  'pills',\n",
       "  '30',\n",
       "  '15mg',\n",
       "  'for',\n",
       "  '203',\n",
       "  '70',\n",
       "  '60',\n",
       "  '15mg',\n",
       "  'for',\n",
       "  '385',\n",
       "  '80',\n",
       "  '90',\n",
       "  '15mg',\n",
       "  'for',\n",
       "  '562',\n",
       "  '50',\n",
       "  'visa',\n",
       "  'only'],\n",
       " ['ordercializviagra',\n",
       "  'online',\n",
       "  'save',\n",
       "  '75',\n",
       "  '90',\n",
       "  '0nline',\n",
       "  'pharmacy',\n",
       "  'noprescription',\n",
       "  'required',\n",
       "  'buy',\n",
       "  'canadian',\n",
       "  'drugs',\n",
       "  'at',\n",
       "  'wholesale',\n",
       "  'prices',\n",
       "  'and',\n",
       "  'save',\n",
       "  '75',\n",
       "  '90',\n",
       "  'fda',\n",
       "  'approved',\n",
       "  'drugs',\n",
       "  'superb',\n",
       "  'quality',\n",
       "  'drugs',\n",
       "  'only',\n",
       "  'accept',\n",
       "  'all',\n",
       "  'major',\n",
       "  'credit',\n",
       "  'cards'],\n",
       " ['you',\n",
       "  'have',\n",
       "  'everything',\n",
       "  'to',\n",
       "  'gain',\n",
       "  'incredib1e',\n",
       "  'gains',\n",
       "  'in',\n",
       "  'length',\n",
       "  'of',\n",
       "  'inches',\n",
       "  'to',\n",
       "  'yourpenis',\n",
       "  'permanantly',\n",
       "  'amazing',\n",
       "  'increase',\n",
       "  'in',\n",
       "  'thickness',\n",
       "  'of',\n",
       "  'yourpenis',\n",
       "  'up',\n",
       "  'to',\n",
       "  '30',\n",
       "  'betterejacu1ation',\n",
       "  'control',\n",
       "  'experience',\n",
       "  'rock',\n",
       "  'harderecetions',\n",
       "  'explosive',\n",
       "  'intenseorgasns',\n",
       "  'increase',\n",
       "  'volume',\n",
       "  'ofejacu1ate',\n",
       "  'doctor',\n",
       "  'designed',\n",
       "  'and',\n",
       "  'endorsed',\n",
       "  '100',\n",
       "  'herbal',\n",
       "  '100',\n",
       "  'natural',\n",
       "  '100',\n",
       "  'safe',\n",
       "  'the',\n",
       "  'proven',\n",
       "  'naturalpenisenhancement',\n",
       "  'that',\n",
       "  'works',\n",
       "  '100',\n",
       "  'moneyback',\n",
       "  'guaranteeed'],\n",
       " ['buy',\n",
       "  'ambiem',\n",
       "  'zolpidem',\n",
       "  '5mg',\n",
       "  '10mg',\n",
       "  '39',\n",
       "  'pill',\n",
       "  '30',\n",
       "  'pills',\n",
       "  'mg',\n",
       "  '129',\n",
       "  '00',\n",
       "  '60',\n",
       "  'pills',\n",
       "  'mg',\n",
       "  '199',\n",
       "  '20',\n",
       "  '180',\n",
       "  'pills',\n",
       "  'mg',\n",
       "  '430',\n",
       "  '20',\n",
       "  '30',\n",
       "  'pills',\n",
       "  '10',\n",
       "  'mg',\n",
       "  '138',\n",
       "  '00',\n",
       "  '120',\n",
       "  'pills',\n",
       "  '10',\n",
       "  'mg',\n",
       "  '322',\n",
       "  '80'],\n",
       " ['ordercializviagra',\n",
       "  'online',\n",
       "  'save',\n",
       "  '75',\n",
       "  '90',\n",
       "  '0nline',\n",
       "  'pharmacy',\n",
       "  'noprescription',\n",
       "  'required',\n",
       "  'buy',\n",
       "  'canadian',\n",
       "  'drugs',\n",
       "  'at',\n",
       "  'wholesale',\n",
       "  'prices',\n",
       "  'and',\n",
       "  'save',\n",
       "  '75',\n",
       "  '90',\n",
       "  'fda',\n",
       "  'approved',\n",
       "  'drugs',\n",
       "  'superb',\n",
       "  'quality',\n",
       "  'drugs',\n",
       "  'only',\n",
       "  'accept',\n",
       "  'all',\n",
       "  'major',\n",
       "  'credit',\n",
       "  'cards',\n",
       "  'order',\n",
       "  'today',\n",
       "  'from',\n",
       "  '38'],\n",
       " ['buyviagra',\n",
       "  '25mg',\n",
       "  '50mg',\n",
       "  '100mg',\n",
       "  'brandviagra',\n",
       "  'femaleviagra',\n",
       "  'from',\n",
       "  '15',\n",
       "  'per',\n",
       "  'pill',\n",
       "  'viagranoprescription',\n",
       "  'needed',\n",
       "  'from',\n",
       "  'certified',\n",
       "  'canadian',\n",
       "  'pharmacy',\n",
       "  'buy',\n",
       "  'here',\n",
       "  'we',\n",
       "  'accept',\n",
       "  'visa',\n",
       "  'amex',\n",
       "  'check',\n",
       "  'worldwide',\n",
       "  'delivery'],\n",
       " ['you',\n",
       "  'have',\n",
       "  'everything',\n",
       "  'to',\n",
       "  'gain',\n",
       "  'incredib1e',\n",
       "  'gains',\n",
       "  'in',\n",
       "  'length',\n",
       "  'of',\n",
       "  'inches',\n",
       "  'to',\n",
       "  'yourpenis',\n",
       "  'permanantly',\n",
       "  'amazing',\n",
       "  'increase',\n",
       "  'in',\n",
       "  'thickness',\n",
       "  'of',\n",
       "  'yourpenis',\n",
       "  'up',\n",
       "  'to',\n",
       "  '30',\n",
       "  'betterejacu1ation',\n",
       "  'control',\n",
       "  'experience',\n",
       "  'rock',\n",
       "  'harderecetions',\n",
       "  'explosive',\n",
       "  'intenseorgasns',\n",
       "  'increase',\n",
       "  'volume',\n",
       "  'ofejacu1ate',\n",
       "  'doctor',\n",
       "  'designed',\n",
       "  'and',\n",
       "  'endorsed',\n",
       "  '100',\n",
       "  'herbal',\n",
       "  '100',\n",
       "  'natural',\n",
       "  '100',\n",
       "  'safe'],\n",
       " ['you',\n",
       "  'have',\n",
       "  'everything',\n",
       "  'to',\n",
       "  'gain',\n",
       "  'incredib1e',\n",
       "  'gains',\n",
       "  'in',\n",
       "  'length',\n",
       "  'of',\n",
       "  'inches',\n",
       "  'to',\n",
       "  'yourpenis',\n",
       "  'permanantly',\n",
       "  'amazing',\n",
       "  'increase',\n",
       "  'in',\n",
       "  'thickness',\n",
       "  'of',\n",
       "  'yourpenis',\n",
       "  'up',\n",
       "  'to',\n",
       "  '30',\n",
       "  'betterejacu1ation',\n",
       "  'control',\n",
       "  'experience',\n",
       "  'rock',\n",
       "  'harderecetions',\n",
       "  'explosive',\n",
       "  'intenseorgasns',\n",
       "  'increase',\n",
       "  'volume',\n",
       "  'ofejacu1ate',\n",
       "  'doctor',\n",
       "  'designed',\n",
       "  'and',\n",
       "  'endorsed',\n",
       "  '100',\n",
       "  'herbal',\n",
       "  '100',\n",
       "  'natural',\n",
       "  '100',\n",
       "  'safe'],\n",
       " ['home',\n",
       "  'based',\n",
       "  'business',\n",
       "  'opportunity',\n",
       "  'is',\n",
       "  'knocking',\n",
       "  'at',\n",
       "  'your',\n",
       "  'door',\n",
       "  'don抰',\n",
       "  'be',\n",
       "  'rude',\n",
       "  'and',\n",
       "  'let',\n",
       "  'this',\n",
       "  'chance',\n",
       "  'go',\n",
       "  'by',\n",
       "  'you',\n",
       "  'can',\n",
       "  'earn',\n",
       "  'great',\n",
       "  'income',\n",
       "  'and',\n",
       "  'find',\n",
       "  'your',\n",
       "  'financial',\n",
       "  'life',\n",
       "  'transformed',\n",
       "  'learn',\n",
       "  'more',\n",
       "  'here',\n",
       "  'to',\n",
       "  'your',\n",
       "  'success',\n",
       "  'work',\n",
       "  'from',\n",
       "  'home',\n",
       "  'finder',\n",
       "  'experts'],\n",
       " ['codeine',\n",
       "  'the',\n",
       "  'most',\n",
       "  'competitive',\n",
       "  'price',\n",
       "  'on',\n",
       "  'net',\n",
       "  'codeine',\n",
       "  'wilson',\n",
       "  '30mg',\n",
       "  '30',\n",
       "  '156',\n",
       "  '00',\n",
       "  'codeine',\n",
       "  'wilson',\n",
       "  '30mg',\n",
       "  '60',\n",
       "  '291',\n",
       "  '00',\n",
       "  'freeviagra',\n",
       "  'pills',\n",
       "  'codeine',\n",
       "  'wilson',\n",
       "  '30mg',\n",
       "  '90',\n",
       "  '396',\n",
       "  '00',\n",
       "  'freeviagra',\n",
       "  'pills',\n",
       "  'codeine',\n",
       "  'wilson',\n",
       "  '30mg',\n",
       "  '120',\n",
       "  '492',\n",
       "  '00',\n",
       "  '10',\n",
       "  'freeviagra',\n",
       "  'pills'],\n",
       " ['get',\n",
       "  'up',\n",
       "  'to',\n",
       "  '75',\n",
       "  'off',\n",
       "  'at',\n",
       "  'online',\n",
       "  'watchesstore',\n",
       "  'discount',\n",
       "  'watches',\n",
       "  'for',\n",
       "  'all',\n",
       "  'famous',\n",
       "  'brands',\n",
       "  'watches',\n",
       "  'arolexbvlgari',\n",
       "  'dior',\n",
       "  'hermes',\n",
       "  'oris',\n",
       "  'cartier',\n",
       "  'ap',\n",
       "  'and',\n",
       "  'more',\n",
       "  'brands',\n",
       "  'louis',\n",
       "  'vuitton',\n",
       "  'bags',\n",
       "  'wallets',\n",
       "  'gucci',\n",
       "  'bags',\n",
       "  'tiffany',\n",
       "  'co',\n",
       "  'jewerly',\n",
       "  'enjoy',\n",
       "  'full',\n",
       "  'year',\n",
       "  'warranty',\n",
       "  'shipment',\n",
       "  'via',\n",
       "  'reputable',\n",
       "  'courier',\n",
       "  'fedex',\n",
       "  'ups',\n",
       "  'dhl',\n",
       "  'and',\n",
       "  'ems',\n",
       "  'speedpost',\n",
       "  'you',\n",
       "  'will',\n",
       "  '100',\n",
       "  'recieve',\n",
       "  'your',\n",
       "  'order',\n",
       "  'save',\n",
       "  'up',\n",
       "  'to',\n",
       "  '75',\n",
       "  'off',\n",
       "  'quality',\n",
       "  'watches'],\n",
       " ['hydrocodone',\n",
       "  'vicodin',\n",
       "  'es',\n",
       "  'brand',\n",
       "  'watson',\n",
       "  'vicodin',\n",
       "  'es',\n",
       "  '750',\n",
       "  'mg',\n",
       "  '30',\n",
       "  '195',\n",
       "  '120',\n",
       "  '570',\n",
       "  'brand',\n",
       "  'watson',\n",
       "  '750',\n",
       "  'mg',\n",
       "  '30',\n",
       "  '195',\n",
       "  '120',\n",
       "  '570',\n",
       "  'brand',\n",
       "  'watson',\n",
       "  '10',\n",
       "  '325',\n",
       "  'mg',\n",
       "  '30',\n",
       "  '199',\n",
       "  '120',\n",
       "  '588',\n",
       "  'noprescription',\n",
       "  'required',\n",
       "  'free',\n",
       "  'express',\n",
       "  'fedex',\n",
       "  'days',\n",
       "  'delivery',\n",
       "  'for',\n",
       "  'over',\n",
       "  '200',\n",
       "  'order',\n",
       "  'major',\n",
       "  'credit',\n",
       "  'cards',\n",
       "  'check'],\n",
       " ['get',\n",
       "  'up',\n",
       "  'to',\n",
       "  '75',\n",
       "  'off',\n",
       "  'at',\n",
       "  'online',\n",
       "  'watchesstore',\n",
       "  'discount',\n",
       "  'watches',\n",
       "  'for',\n",
       "  'all',\n",
       "  'famous',\n",
       "  'brands',\n",
       "  'watches',\n",
       "  'arolexbvlgari',\n",
       "  'dior',\n",
       "  'hermes',\n",
       "  'oris',\n",
       "  'cartier',\n",
       "  'ap',\n",
       "  'and',\n",
       "  'more',\n",
       "  'brands',\n",
       "  'louis',\n",
       "  'vuitton',\n",
       "  'bags',\n",
       "  'wallets',\n",
       "  'gucci',\n",
       "  'bags',\n",
       "  'tiffany',\n",
       "  'co',\n",
       "  'jewerly',\n",
       "  'enjoy',\n",
       "  'full',\n",
       "  'year',\n",
       "  'warranty',\n",
       "  'shipment',\n",
       "  'via',\n",
       "  'reputable',\n",
       "  'courier',\n",
       "  'fedex',\n",
       "  'ups',\n",
       "  'dhl',\n",
       "  'and',\n",
       "  'ems',\n",
       "  'speedpost',\n",
       "  'you',\n",
       "  'will',\n",
       "  '100',\n",
       "  'recieve',\n",
       "  'your',\n",
       "  'order'],\n",
       " ['percocet',\n",
       "  '10',\n",
       "  '625',\n",
       "  'mg',\n",
       "  'withoutprescription',\n",
       "  '30',\n",
       "  'tabs',\n",
       "  '225',\n",
       "  'percocet',\n",
       "  'narcotic',\n",
       "  'analgesic',\n",
       "  'is',\n",
       "  'used',\n",
       "  'to',\n",
       "  'treat',\n",
       "  'moderate',\n",
       "  'to',\n",
       "  'moderately',\n",
       "  'severepain',\n",
       "  'top',\n",
       "  'quality',\n",
       "  'express',\n",
       "  'shipping',\n",
       "  '100',\n",
       "  'safe',\n",
       "  'discreet',\n",
       "  'private',\n",
       "  'buy',\n",
       "  'cheap',\n",
       "  'percocet',\n",
       "  'online'],\n",
       " ['get',\n",
       "  'up',\n",
       "  'to',\n",
       "  '75',\n",
       "  'off',\n",
       "  'at',\n",
       "  'online',\n",
       "  'watchesstore',\n",
       "  'discount',\n",
       "  'watches',\n",
       "  'for',\n",
       "  'all',\n",
       "  'famous',\n",
       "  'brands',\n",
       "  'watches',\n",
       "  'arolexbvlgari',\n",
       "  'dior',\n",
       "  'hermes',\n",
       "  'oris',\n",
       "  'cartier',\n",
       "  'ap',\n",
       "  'and',\n",
       "  'more',\n",
       "  'brands',\n",
       "  'louis',\n",
       "  'vuitton',\n",
       "  'bags',\n",
       "  'wallets',\n",
       "  'gucci',\n",
       "  'bags',\n",
       "  'tiffany',\n",
       "  'co',\n",
       "  'jewerly',\n",
       "  'enjoy',\n",
       "  'full',\n",
       "  'year',\n",
       "  'warranty',\n",
       "  'shipment',\n",
       "  'via',\n",
       "  'reputable',\n",
       "  'courier',\n",
       "  'fedex',\n",
       "  'ups',\n",
       "  'dhl',\n",
       "  'and',\n",
       "  'ems',\n",
       "  'speedpost',\n",
       "  'you',\n",
       "  'will',\n",
       "  '100',\n",
       "  'recieve',\n",
       "  'your',\n",
       "  'order'],\n",
       " ['you',\n",
       "  'have',\n",
       "  'everything',\n",
       "  'to',\n",
       "  'gain',\n",
       "  'incredib1e',\n",
       "  'gains',\n",
       "  'in',\n",
       "  'length',\n",
       "  'of',\n",
       "  'inches',\n",
       "  'to',\n",
       "  'yourpenis',\n",
       "  'permanantly',\n",
       "  'amazing',\n",
       "  'increase',\n",
       "  'in',\n",
       "  'thickness',\n",
       "  'of',\n",
       "  'yourpenis',\n",
       "  'up',\n",
       "  'to',\n",
       "  '30',\n",
       "  'betterejacu1ation',\n",
       "  'control',\n",
       "  'experience',\n",
       "  'rock',\n",
       "  'harderecetions',\n",
       "  'explosive',\n",
       "  'intenseorgasns',\n",
       "  'increase',\n",
       "  'volume',\n",
       "  'ofejacu1ate',\n",
       "  'doctor',\n",
       "  'designed',\n",
       "  'and',\n",
       "  'endorsed',\n",
       "  '100',\n",
       "  'herbal',\n",
       "  '100',\n",
       "  'natural',\n",
       "  '100',\n",
       "  'safe'],\n",
       " ['you',\n",
       "  'have',\n",
       "  'everything',\n",
       "  'to',\n",
       "  'gain',\n",
       "  'incredib1e',\n",
       "  'gains',\n",
       "  'in',\n",
       "  'length',\n",
       "  'of',\n",
       "  'inches',\n",
       "  'to',\n",
       "  'yourpenis',\n",
       "  'permanantly',\n",
       "  'amazing',\n",
       "  'increase',\n",
       "  'in',\n",
       "  'thickness',\n",
       "  'of',\n",
       "  'yourpenis',\n",
       "  'up',\n",
       "  'to',\n",
       "  '30',\n",
       "  'betterejacu1ation',\n",
       "  'control',\n",
       "  'experience',\n",
       "  'rock',\n",
       "  'harderecetions',\n",
       "  'explosive',\n",
       "  'intenseorgasns',\n",
       "  'increase',\n",
       "  'volume',\n",
       "  'ofejacu1ate',\n",
       "  'doctor',\n",
       "  'designed',\n",
       "  'and',\n",
       "  'endorsed',\n",
       "  '100',\n",
       "  'herbal',\n",
       "  '100',\n",
       "  'natural',\n",
       "  '100',\n",
       "  'safe'],\n",
       " ['experience',\n",
       "  'with',\n",
       "  'biggerpenis',\n",
       "  'today',\n",
       "  'grow',\n",
       "  'inches',\n",
       "  'more',\n",
       "  'the',\n",
       "  'safest',\n",
       "  'most',\n",
       "  'effective',\n",
       "  'methods',\n",
       "  'of_penisen1argement',\n",
       "  'save',\n",
       "  'your',\n",
       "  'time',\n",
       "  'and',\n",
       "  'money',\n",
       "  'bettererections',\n",
       "  'with',\n",
       "  'effective',\n",
       "  'ma1eenhancement',\n",
       "  'products',\n",
       "  'ma1eenhancement',\n",
       "  'supplement',\n",
       "  'trusted',\n",
       "  'by',\n",
       "  'millions',\n",
       "  'buy',\n",
       "  'today'],\n",
       " ['you',\n",
       "  'have',\n",
       "  'everything',\n",
       "  'to',\n",
       "  'gain',\n",
       "  'incredib1e',\n",
       "  'gains',\n",
       "  'in',\n",
       "  'length',\n",
       "  'of',\n",
       "  'inches',\n",
       "  'to',\n",
       "  'yourpenis',\n",
       "  'permanantly',\n",
       "  'amazing',\n",
       "  'increase',\n",
       "  'in',\n",
       "  'thickness',\n",
       "  'of',\n",
       "  'yourpenis',\n",
       "  'up',\n",
       "  'to',\n",
       "  '30',\n",
       "  'betterejacu1ation',\n",
       "  'control',\n",
       "  'experience',\n",
       "  'rock',\n",
       "  'harderecetions',\n",
       "  'explosive',\n",
       "  'intenseorgasns',\n",
       "  'increase',\n",
       "  'volume',\n",
       "  'ofejacu1ate',\n",
       "  'doctor',\n",
       "  'designed',\n",
       "  'and',\n",
       "  'endorsed',\n",
       "  '100',\n",
       "  'herbal',\n",
       "  '100',\n",
       "  'natural',\n",
       "  '100',\n",
       "  'safe',\n",
       "  'the',\n",
       "  'proven',\n",
       "  'naturalpenisenhancement',\n",
       "  'that',\n",
       "  'works',\n",
       "  '100',\n",
       "  'moneyback',\n",
       "  'guaranteeed'],\n",
       " ['percocet',\n",
       "  '10',\n",
       "  '625',\n",
       "  'mg',\n",
       "  'withoutprescription',\n",
       "  '30',\n",
       "  'tabs',\n",
       "  '225',\n",
       "  'percocet',\n",
       "  'narcotic',\n",
       "  'analgesic',\n",
       "  'is',\n",
       "  'used',\n",
       "  'to',\n",
       "  'treat',\n",
       "  'moderate',\n",
       "  'to',\n",
       "  'moderately',\n",
       "  'severepain',\n",
       "  'top',\n",
       "  'quality',\n",
       "  'express',\n",
       "  'shipping',\n",
       "  '100',\n",
       "  'safe',\n",
       "  'discreet',\n",
       "  'private',\n",
       "  'buy',\n",
       "  'cheap',\n",
       "  'percocet',\n",
       "  'online'],\n",
       " ['codeine',\n",
       "  '15mg',\n",
       "  '30',\n",
       "  'for',\n",
       "  '203',\n",
       "  '70',\n",
       "  'visa',\n",
       "  'only',\n",
       "  'codeine',\n",
       "  'methylmorphine',\n",
       "  'is',\n",
       "  'narcotic',\n",
       "  'opioid',\n",
       "  'pain',\n",
       "  'reliever',\n",
       "  'we',\n",
       "  'have',\n",
       "  '15mg',\n",
       "  '30mg',\n",
       "  'pills',\n",
       "  '30',\n",
       "  '15mg',\n",
       "  'for',\n",
       "  '203',\n",
       "  '70',\n",
       "  '60',\n",
       "  '15mg',\n",
       "  'for',\n",
       "  '385',\n",
       "  '80',\n",
       "  '90',\n",
       "  '15mg',\n",
       "  'for',\n",
       "  '562',\n",
       "  '50',\n",
       "  'visa',\n",
       "  'only'],\n",
       " ['oem',\n",
       "  'adobe',\n",
       "  'microsoft',\n",
       "  'softwares',\n",
       "  'fast',\n",
       "  'order',\n",
       "  'and',\n",
       "  'download',\n",
       "  'microsoft',\n",
       "  'office',\n",
       "  'professional',\n",
       "  'plus',\n",
       "  '2007',\n",
       "  '2010',\n",
       "  '129',\n",
       "  'microsoft',\n",
       "  'windows',\n",
       "  'ultimate',\n",
       "  '119',\n",
       "  'adobe',\n",
       "  'photoshop',\n",
       "  'cs5',\n",
       "  'extended',\n",
       "  'adobe',\n",
       "  'acrobat',\n",
       "  'pro',\n",
       "  'extended',\n",
       "  'windows',\n",
       "  'xp',\n",
       "  'professional',\n",
       "  'thousand',\n",
       "  'more',\n",
       "  'titles'],\n",
       " ['bargains',\n",
       "  'here',\n",
       "  'buy',\n",
       "  'phentermin',\n",
       "  '37',\n",
       "  'mg',\n",
       "  '25',\n",
       "  'buy',\n",
       "  'genuine',\n",
       "  'phentermin',\n",
       "  'at',\n",
       "  'low',\n",
       "  'cost',\n",
       "  'visa',\n",
       "  'accepted',\n",
       "  '30',\n",
       "  '130',\n",
       "  '50',\n",
       "  '60',\n",
       "  '219',\n",
       "  '00',\n",
       "  '90',\n",
       "  '292',\n",
       "  '50',\n",
       "  '120',\n",
       "  '366',\n",
       "  '00',\n",
       "  '180',\n",
       "  '513',\n",
       "  '00'],\n",
       " ['you',\n",
       "  'have',\n",
       "  'everything',\n",
       "  'to',\n",
       "  'gain',\n",
       "  'incredib1e',\n",
       "  'gains',\n",
       "  'in',\n",
       "  'length',\n",
       "  'of',\n",
       "  'inches',\n",
       "  'to',\n",
       "  'yourpenis',\n",
       "  'permanantly',\n",
       "  'amazing',\n",
       "  'increase',\n",
       "  'in',\n",
       "  'thickness',\n",
       "  'of',\n",
       "  'yourpenis',\n",
       "  'up',\n",
       "  'to',\n",
       "  '30',\n",
       "  'betterejacu1ation',\n",
       "  'control',\n",
       "  'experience',\n",
       "  'rock',\n",
       "  'harderecetions',\n",
       "  'explosive',\n",
       "  'intenseorgasns',\n",
       "  'increase',\n",
       "  'volume',\n",
       "  'ofejacu1ate',\n",
       "  'doctor',\n",
       "  'designed',\n",
       "  'and',\n",
       "  'endorsed',\n",
       "  '100',\n",
       "  'herbal',\n",
       "  '100',\n",
       "  'natural',\n",
       "  '100',\n",
       "  'safe'],\n",
       " ['bargains',\n",
       "  'here',\n",
       "  'buy',\n",
       "  'phentermin',\n",
       "  '37',\n",
       "  'mg',\n",
       "  '25',\n",
       "  'buy',\n",
       "  'genuine',\n",
       "  'phentermin',\n",
       "  'at',\n",
       "  'low',\n",
       "  'cost',\n",
       "  'visa',\n",
       "  'accepted',\n",
       "  '30',\n",
       "  '130',\n",
       "  '50',\n",
       "  '60',\n",
       "  '219',\n",
       "  '00',\n",
       "  '90',\n",
       "  '292',\n",
       "  '50',\n",
       "  '120',\n",
       "  '366',\n",
       "  '00',\n",
       "  '180',\n",
       "  '513',\n",
       "  '00']]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data['docs']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getDocList(docs):\n",
    "    docSet = set([])\n",
    "    for doc in docs:\n",
    "        docSet = set(doc) | docSet\n",
    "    doclist = list(docSet)\n",
    "    doclist.sort()\n",
    "    return doclist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "docList = getDocList(data['docs'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "def doc2V(doc,docList):\n",
    "    dims = len(docList)\n",
    "    doc_v = np.zeros(dims)\n",
    "    for word in doc:\n",
    "        if word in doc:\n",
    "            doc_v[docList.index(word)] +=1\n",
    "    return doc_v"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "docs_v = list(map(lambda doc:doc2V(doc,docList),data['docs']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def train(docs_v,labels,docList):\n",
    "    n_doc = len(labels)\n",
    "    docLen = len(docList)\n",
    "    #防止0概率出现\n",
    "    p1num = np.ones(docLen)\n",
    "    p0num = np.ones(docLen)\n",
    "    p1Denom,p0Denom = 2, 2\n",
    "    #遍历所有的doc向量\n",
    "    for i in range(n_doc):\n",
    "        if labels[i] == 'spam':\n",
    "            p1num += docs_v[i]\n",
    "            p1Denom += np.sum(docs_v[i])\n",
    "        elif labels[i] =='ham':\n",
    "            p0num += docs_v[i]\n",
    "            p0Denom += np.sum(docs_v[i])\n",
    "    return np.log(p1num/p1Denom),np.log(p0num/p0Denom),labels.count('spam')/n_doc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['ham',\n",
       " 'ham',\n",
       " 'ham',\n",
       " 'ham',\n",
       " 'ham',\n",
       " 'ham',\n",
       " 'ham',\n",
       " 'ham',\n",
       " 'ham',\n",
       " 'ham',\n",
       " 'ham',\n",
       " 'ham',\n",
       " 'ham',\n",
       " 'ham',\n",
       " 'ham',\n",
       " 'ham',\n",
       " 'ham',\n",
       " 'ham',\n",
       " 'ham',\n",
       " 'ham',\n",
       " 'ham',\n",
       " 'ham',\n",
       " 'ham',\n",
       " 'ham',\n",
       " 'ham',\n",
       " 'spam',\n",
       " 'spam',\n",
       " 'spam',\n",
       " 'spam',\n",
       " 'spam',\n",
       " 'spam',\n",
       " 'spam',\n",
       " 'spam',\n",
       " 'spam',\n",
       " 'spam',\n",
       " 'spam',\n",
       " 'spam',\n",
       " 'spam',\n",
       " 'spam',\n",
       " 'spam',\n",
       " 'spam',\n",
       " 'spam',\n",
       " 'spam',\n",
       " 'spam',\n",
       " 'spam',\n",
       " 'spam',\n",
       " 'spam',\n",
       " 'spam',\n",
       " 'spam',\n",
       " 'spam']"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data['labels']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "p1doc,p0doc,pA = train(docs_v,data['labels'],docList)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "def classify(doc,p1doc,p0doc,pA):\n",
    "    doc_v = doc2V(doc,docList)\n",
    "    p1 = np.sum(doc_v * p1doc) + np.log(pA)\n",
    "    p0 = np.sum(doc_v * p0doc) + np.log(1-pA)\n",
    "    if p1 > p0:\n",
    "        return 'spam'\n",
    "    else :\n",
    "        return 'ham'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'ham'"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "classify(data['docs'][24],p1doc,p0doc,pA)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'spam'"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "classify(data['docs'][30],p1doc,p0doc,pA)"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "a6dc62afd8b03c17538a9dfce2fcb18f62cec380cc7b77050462a64b7e4e4814"
  },
  "kernelspec": {
   "display_name": "Python 3.8.0 32-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
